Note: hospdays records the number of days hospitalized after study enrollment. In another script, I test tothosp (the total number of hospitalized days). I try to control for hospitalization unrelated to infelunza by excluding patients with underlying conditions from the tothosp analysis

Notes on data formatting from Deborah Wentworth:

Visualize the relationship between age and duration of hospitalization

## 
##  Spearman's rank correlation rho
## 
## data:  valid$age and valid$hospdays
## S = 56530000, p-value = 2.296e-10
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.227336

There is a positive correlation between age and duration of hospitalization.

Visualize the relationship between age and prob of H3N2 infection

## 
##  Spearman's rank correlation rho
## 
## data:  valid$age and valid$hospdays
## S = 59538000, p-value = 1.403e-07
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1894313

Results show a positive correlation between duration of hosp and age.

Visualize the relationship between country and symptom duration

H1N1

H3N2

No clear effect of country

Visualize the relationship between season and symptom duration

H1N1

No clear effect of season.

H3N2

No clear effect of season –> Try models that don’t include season and country as blocking vars

H1N1 10-fold CV

We know the relationship between age and probability of infection should be non-linear. Use cross-validation to verify, and to choose the number of degrees of freedom to include in a spline

Plot the overall test error rate vs. df

No clear preference. Use 4 df?

H3N2 10-fold CV

We know the relationship between age and probability of infection should be non-linear. Use cross-validation to verify, and to choose the number of degrees of freedom to include in a spline

Plot the overall test error rate vs. df

Conclusion: Clear preference for linear fit for H3N2

Set up a logistic regression that includes the following predictors:

  • age
  • imprinting status
  • vaccination status
  • antiviral use
  • season
  • country

Clean data for fitting

## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## 
## Argentina Australia   Austria   Belgium     Chile     China   Denmark 
##        50        69         3        11         1        15        67 
##   Germany    Greece    Norway      Peru    Poland Singapore     Spain 
##        41        81         7        15         6         6        40 
##  Thailand        UK       USA 
##        64       131       114
## [1] 721

First fit a model where the only independent variable is age

Treat vaccination, antivial use, underlying symptoms, country and season as blocking variables with fixed effects

library(splines)
## H1N1 fit to medical history only (no effect of country, season, age or imprinting)
fit00 = glm(hospdays ~ anyvac + anyav + anydx, data = train.H1N1)
summary(fit00)
## 
## Call:
## glm(formula = hospdays ~ anyvac + anyav + anydx, data = train.H1N1)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -9.111  -5.980  -3.703   0.297  90.471  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    4.121      1.268   3.250  0.00122 **
## anyvac1       -2.131      1.036  -2.056  0.04016 * 
## anyav1         1.582      1.195   1.324  0.18590   
## anydx1         3.407      1.032   3.300  0.00102 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 128.5975)
## 
##     Null deviance: 84839  on 648  degrees of freedom
## Residual deviance: 82945  on 645  degrees of freedom
## AIC: 4999.8
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to country, season and medical history only (no effect of age or imprinting)  
fit0 = glm(hospdays ~ anyvac + anyav + anydx + country + season, data = train.H1N1)
summary(fit0)
## 
## Call:
## glm(formula = hospdays ~ anyvac + anyav + anydx + country + season, 
##     data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -15.839   -5.600   -2.717    0.785   91.142  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        5.2459     3.2847   1.597  0.11076   
## anyvac1           -1.6792     1.0896  -1.541  0.12379   
## anyav1             1.9835     1.2191   1.627  0.10425   
## anydx1             2.3804     1.0585   2.249  0.02487 * 
## countryAustralia  -2.6710     2.6975  -0.990  0.32248   
## countryBelgium     1.9673     4.6162   0.426  0.67013   
## countryDenmark    -4.9104     3.1565  -1.556  0.12030   
## countryGermany     5.1522     3.3357   1.545  0.12296   
## countryGreece     -2.7547     3.0787  -0.895  0.37126   
## countryOther       1.6503     3.0943   0.533  0.59400   
## countrySpain      -1.1243     3.3652  -0.334  0.73841   
## countryThailand   -4.1520     2.6743  -1.553  0.12104   
## countryUK         -2.3741     2.9263  -0.811  0.41750   
## countryUSA        -1.8117     2.9810  -0.608  0.54358   
## seasonNH.10.11     5.0602     1.6700   3.030  0.00255 **
## seasonNH.11.12    -3.1345     4.5065  -0.696  0.48698   
## seasonNH.12.13    -4.1199     3.3786  -1.219  0.22316   
## seasonNH.13.14     1.0435     1.8370   0.568  0.57020   
## seasonNH.14.15     1.9294     3.5393   0.545  0.58585   
## seasonNH.15.16     0.2425     1.5790   0.154  0.87798   
## seasonNH.16.17    -5.4515     4.0721  -1.339  0.18114   
## seasonSH.10        4.6119     3.5179   1.311  0.19034   
## seasonSH.11        1.7367     3.5131   0.494  0.62123   
## seasonSH.12       -3.0302     5.2846  -0.573  0.56658   
## seasonSH.13       -1.0079     3.5518  -0.284  0.77669   
## seasonSH.14        8.1922     3.9675   2.065  0.03935 * 
## seasonSH.15        1.0612     5.1690   0.205  0.83741   
## seasonSH.16       -0.8627     2.5379  -0.340  0.73403   
## seasonSH.17       -6.5325     6.6345  -0.985  0.32519   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 122.8775)
## 
##     Null deviance: 84839  on 648  degrees of freedom
## Residual deviance: 76184  on 620  degrees of freedom
## AIC: 4994.6
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to age only, plus fixed effects of vaccination, av use, underlying conditions, country and season
fit1 = glm(hospdays ~ ns(age, knots = 65) + anyvac + anyav + anydx + country + season, data = train.H1N1)
summary(fit1)
## 
## Call:
## glm(formula = hospdays ~ ns(age, knots = 65) + anyvac + anyav + 
##     anydx + country + season, data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -18.026   -5.409   -2.689    1.186   90.596  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)            2.1570     3.4560   0.624  0.53277   
## ns(age, knots = 65)1   9.0915     3.0852   2.947  0.00333 **
## ns(age, knots = 65)2   3.6643     2.7847   1.316  0.18871   
## anyvac1               -2.2952     1.1092  -2.069  0.03894 * 
## anyav1                 2.0316     1.2121   1.676  0.09421 . 
## anydx1                 1.4890     1.0911   1.365  0.17287   
## countryAustralia      -1.8932     2.7184  -0.696  0.48642   
## countryBelgium         3.4779     4.6327   0.751  0.45310   
## countryDenmark        -3.6158     3.1817  -1.136  0.25621   
## countryGermany         6.6412     3.3539   1.980  0.04813 * 
## countryGreece         -2.5039     3.0698  -0.816  0.41502   
## countryOther           2.3142     3.0955   0.748  0.45500   
## countrySpain          -0.4583     3.3667  -0.136  0.89178   
## countryThailand       -3.6032     2.6702  -1.349  0.17771   
## countryUK             -1.0510     2.9502  -0.356  0.72177   
## countryUSA            -0.9344     2.9900  -0.313  0.75476   
## seasonNH.10.11         4.5327     1.6688   2.716  0.00679 **
## seasonNH.11.12        -2.0166     4.5005  -0.448  0.65425   
## seasonNH.12.13        -5.1089     3.3737  -1.514  0.13045   
## seasonNH.13.14         0.8928     1.8294   0.488  0.62571   
## seasonNH.14.15         1.1918     3.5591   0.335  0.73783   
## seasonNH.15.16        -0.5854     1.5953  -0.367  0.71379   
## seasonNH.16.17        -5.6394     4.0486  -1.393  0.16415   
## seasonSH.10            5.5224     3.5105   1.573  0.11620   
## seasonSH.11            2.1123     3.4972   0.604  0.54606   
## seasonSH.12           -1.6833     5.2763  -0.319  0.74981   
## seasonSH.13           -0.2649     3.5419  -0.075  0.94040   
## seasonSH.14            7.6254     3.9534   1.929  0.05421 . 
## seasonSH.15            0.5029     5.1459   0.098  0.92217   
## seasonSH.16           -1.2036     2.5256  -0.477  0.63384   
## seasonSH.17           -7.2836     6.5999  -1.104  0.27020   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 121.4069)
## 
##     Null deviance: 84839  on 648  degrees of freedom
## Residual deviance: 75029  on 618  degrees of freedom
## AIC: 4988.7
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to age plus imprinting, plus fixed effects of vaccination, av use, underlying conditions, country and season
fit2 = glm(hospdays ~ ns(age, knots = 65) + p.g1.protection + anyvac + anyav + anydx + country + season, data = train.H1N1)
summary(fit2)
## 
## Call:
## glm(formula = hospdays ~ ns(age, knots = 65) + p.g1.protection + 
##     anyvac + anyav + anydx + country + season, data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -17.881   -5.335   -2.786    1.361   90.120  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)            2.0872     3.4544   0.604  0.54592   
## ns(age, knots = 65)1   4.8263     4.4761   1.078  0.28135   
## ns(age, knots = 65)2   1.7606     3.1374   0.561  0.57489   
## p.g1.protection        2.3671     1.8008   1.314  0.18918   
## anyvac1               -2.3115     1.1086  -2.085  0.03747 * 
## anyav1                 2.0833     1.2120   1.719  0.08612 . 
## anydx1                 1.4358     1.0913   1.316  0.18875   
## countryAustralia      -1.9683     2.7174  -0.724  0.46914   
## countryBelgium         3.2724     4.6326   0.706  0.48022   
## countryDenmark        -3.6001     3.1798  -1.132  0.25800   
## countryGermany         6.6402     3.3519   1.981  0.04803 * 
## countryGreece         -2.6718     3.0707  -0.870  0.38459   
## countryOther           2.2171     3.0946   0.716  0.47400   
## countrySpain          -0.3318     3.3661  -0.099  0.92152   
## countryThailand       -3.5277     2.6693  -1.322  0.18679   
## countryUK             -1.2312     2.9517  -0.417  0.67674   
## countryUSA            -1.1291     2.9920  -0.377  0.70601   
## seasonNH.10.11         4.5969     1.6685   2.755  0.00604 **
## seasonNH.11.12        -2.3124     4.5035  -0.513  0.60780   
## seasonNH.12.13        -4.8080     3.3795  -1.423  0.15533   
## seasonNH.13.14         1.1754     1.8410   0.638  0.52341   
## seasonNH.14.15         1.4174     3.5611   0.398  0.69076   
## seasonNH.15.16        -0.2263     1.6176  -0.140  0.88880   
## seasonNH.16.17        -5.7337     4.0469  -1.417  0.15704   
## seasonSH.10            5.4131     3.5094   1.542  0.12347   
## seasonSH.11            2.3277     3.4989   0.665  0.50614   
## seasonSH.12           -1.8593     5.2749  -0.352  0.72459   
## seasonSH.13           -0.3346     3.5402  -0.095  0.92472   
## seasonSH.14            7.8071     3.9535   1.975  0.04874 * 
## seasonSH.15            0.5883     5.1433   0.114  0.90897   
## seasonSH.16           -1.0114     2.5283  -0.400  0.68928   
## seasonSH.17           -7.2674     6.5960  -1.102  0.27098   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 121.2641)
## 
##     Null deviance: 84839  on 648  degrees of freedom
## Residual deviance: 74820  on 617  degrees of freedom
## AIC: 4988.8
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to age only, plus fixed effects of vaccination, av use, underlying conditions NO COUNTRY AND SEASON
fit3 = glm(hospdays ~ ns(age, knots = 65) + anyvac + anyav + anydx, data = train.H1N1)
summary(fit3)
## 
## Call:
## glm(formula = hospdays ~ ns(age, knots = 65) + anyvac + anyav + 
##     anydx, data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.629   -5.858   -3.551    0.348   90.087  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             2.340      1.645   1.422  0.15542   
## ns(age, knots = 65)1    6.612      2.921   2.263  0.02396 * 
## ns(age, knots = 65)2    2.540      2.664   0.954  0.34064   
## anyvac1                -2.496      1.050  -2.378  0.01771 * 
## anyav1                  1.473      1.192   1.235  0.21712   
## anydx1                  2.777      1.063   2.614  0.00917 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 127.8588)
## 
##     Null deviance: 84839  on 648  degrees of freedom
## Residual deviance: 82213  on 643  degrees of freedom
## AIC: 4998
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to age plus imprinting, plus fixed effects of vaccination, av use, underlying conditions, NO COUNTRY AND SEASON
fit4 = glm(hospdays ~ ns(age, knots = 65) + p.g1.protection + anyvac + anyav + anydx, data = train.H1N1)
summary(fit4)
## 
## Call:
## glm(formula = hospdays ~ ns(age, knots = 65) + p.g1.protection + 
##     anyvac + anyav + anydx, data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.344   -5.924   -3.391    0.312   89.489  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)            2.2786     1.6426   1.387   0.1659  
## ns(age, knots = 65)1   1.1296     4.2243   0.267   0.7893  
## ns(age, knots = 65)2   0.1362     2.9776   0.046   0.9635  
## p.g1.protection        3.1891     1.7777   1.794   0.0733 .
## anyvac1               -2.5473     1.0482  -2.430   0.0154 *
## anyav1                 1.5904     1.1920   1.334   0.1826  
## anydx1                 2.6530     1.0631   2.496   0.0128 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 127.4193)
## 
##     Null deviance: 84839  on 648  degrees of freedom
## Residual deviance: 81803  on 642  degrees of freedom
## AIC: 4996.8
## 
## Number of Fisher Scoring iterations: 2
## Plot the fits for each model side by side
par(mfrow = c(1, 2), las = 3, mar = c(6, 6, 6, 3), cex.axis = .7)
library(gam)
## Loading required package: foreach
## Loaded gam 1.14-4
plot.gam(fit00, se = T, col = 'dodgerblue', main = 'Simplified Baseline')

plot.gam(fit0, se = T, col = 'dodgerblue', main = 'Baseline')

plot.gam(fit1, se = T, col = 'dodgerblue', main = 'Baseline + age + imp')

plot.gam(fit2, se = T, col = 'dodgerblue', main = 'Baseline + age + imp')

plot.gam(fit3, se = T, col = 'dodgerblue', main = 'Simple baseline + age + imp')

plot.gam(fit4, se = T, col = 'dodgerblue', main = 'Simple baseline + age + imp')

pdf('003Hospdays_H1N1.pdf')
plot.gam(fit4, se = T, col = 'dodgerblue', main = 'Simple baseline + age + imp')
dev.off()
## quartz_off_screen 
##                 2
# Predict probs for each patient
pred00 = predict(fit00, newdata = (test.H1N1))
pred0 = predict(fit0, newdata = (test.H1N1))
pred1 = predict(fit1, newdata = (test.H1N1))
pred2 = predict(fit2, newdata = (test.H1N1))
pred3 = predict(fit3, newdata = (test.H1N1))
pred4 = predict(fit4, newdata = (test.H1N1))
  
# Estimate the simulated error rate
MSE = numeric(6); names(MSE) = c('simple.baseline', 'baseline', 'baeline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')
MSE[1] = mean((pred00 - test.H1N1$hospdays)^2)
MSE[2] = mean((pred0 - test.H1N1$hospdays)^2)
MSE[3] = mean((pred1 - test.H1N1$hospdays)^2)
MSE[4] = mean((pred2 - test.H1N1$hospdays)^2)
MSE[5] = mean((pred3 - test.H1N1$hospdays)^2)
MSE[6] = mean((pred4 - test.H1N1$hospdays)^2)
sort(MSE)
##     simple.baseline+age simple.baseline+age+imp         simple.baseline 
##                171.9364                173.1208                174.8718 
##             baeline+age        baseline+age+imp                baseline 
##                176.6496                177.7045                181.5595
AIC = c(fit00$aic, fit0$aic, fit1$aic, fit2$aic, fit3$aic, fit4$aic); names(AIC) =  c('simple.baseline', 'baseline', 'baeline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')
del.AIC = sort(AIC - min(AIC))
del.AIC
##             baeline+age        baseline+age+imp                baseline 
##               0.0000000               0.1851009               5.9113672 
## simple.baseline+age+imp     simple.baseline+age         simple.baseline 
##               8.0967682               9.3417324              11.0958038
load('master.AIC.table.H1N1.RData')
master.AIC.table['003Hospdays',  c('simple.baseline', 'baseline', 'baseline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')] = AIC - min(AIC)
save(master.AIC.table, file = 'master.AIC.table.H1N1.RData')
rm(master.AIC.table)

Clean H3N2 data for fitting

## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## 
## Argentina Australia   Belgium     China   Denmark   Germany    Greece 
##        97       120         3         7        26        19        24 
##      Peru Singapore     Spain  Thailand        UK       USA 
##         9        21        29       120        89       174
## [1] 738
library(splines)
## H3N2 fit to medical history only (no effect of country, season, age or imprinting)
fit00 = glm(hospdays ~ anyvac + anyav + anydx, data = train.H3N2)
summary(fit00)
## 
## Call:
## glm(formula = hospdays ~ anyvac + anyav + anydx, data = train.H3N2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
##  -8.846   -5.869   -2.869    0.511  139.335  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   4.4656     1.5053   2.967  0.00312 **
## anyvac1       0.2035     0.9361   0.217  0.82797   
## anyav1       -0.9769     1.1727  -0.833  0.40511   
## anydx1        4.1767     1.2831   3.255  0.00119 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 134.0292)
## 
##     Null deviance: 90273  on 664  degrees of freedom
## Residual deviance: 88593  on 661  degrees of freedom
## AIC: 5150.4
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to country, season and medical history only (no effect of age or imprinting)  
fit0 = glm(hospdays ~ anyvac + anyav + anydx + country + season, data = train.H3N2)
summary(fit0)
## 
## Call:
## glm(formula = hospdays ~ anyvac + anyav + anydx + country + season, 
##     data = train.H3N2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -15.286   -5.789   -3.044    1.157  135.644  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       1.87186    4.91067   0.381  0.70319   
## anyvac1           0.32796    1.08242   0.303  0.76200   
## anyav1           -0.01256    1.25298  -0.010  0.99201   
## anydx1            3.96803    1.34685   2.946  0.00334 **
## countryAustralia -1.60759    2.06883  -0.777  0.43742   
## countryBelgium   -3.26689    8.56617  -0.381  0.70305   
## countryDenmark    0.85676    3.47023   0.247  0.80507   
## countryGreece     2.45624    3.45284   0.711  0.47712   
## countryOther      1.36542    3.12273   0.437  0.66208   
## countrySingapore -7.96181    3.19157  -2.495  0.01286 * 
## countrySpain     -0.16994    3.28776  -0.052  0.95879   
## countryThailand  -3.78205    2.22425  -1.700  0.08955 . 
## countryUK        -1.74996    2.61535  -0.669  0.50367   
## countryUSA       -4.16761    2.54615  -1.637  0.10216   
## seasonNH.11.12    2.90061    4.60199   0.630  0.52873   
## seasonNH.12.13    4.05637    4.10030   0.989  0.32290   
## seasonNH.13.14    2.79077    4.46038   0.626  0.53175   
## seasonNH.14.15    5.06027    4.04882   1.250  0.21183   
## seasonNH.15.16    2.75674    4.76703   0.578  0.56327   
## seasonNH.16.17    2.40125    4.12000   0.583  0.56021   
## seasonSH.10      17.03451    7.12019   2.392  0.01703 * 
## seasonSH.11       3.80482    5.46660   0.696  0.48667   
## seasonSH.12       3.32440    5.00129   0.665  0.50648   
## seasonSH.13       2.60350    5.26530   0.494  0.62115   
## seasonSH.14       4.78322    4.66593   1.025  0.30569   
## seasonSH.15       3.62369    4.50804   0.804  0.42180   
## seasonSH.16       5.02466    4.54540   1.105  0.26939   
## seasonSH.17       5.52862    4.68474   1.180  0.23839   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 131.7959)
## 
##     Null deviance: 90273  on 664  degrees of freedom
## Residual deviance: 83954  on 637  degrees of freedom
## AIC: 5162.6
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to age only, plus fixed effects of vaccination, av use, underlying conditions, country and season
fit1 = glm(hospdays ~ ns(age, knots = 65) + anyvac + anyav + anydx + country + season, data = train.H3N2)
summary(fit1)
## 
## Call:
## glm(formula = hospdays ~ ns(age, knots = 65) + anyvac + anyav + 
##     anydx + country + season, data = train.H3N2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -14.529   -5.682   -2.864    1.157  135.440  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           0.54929    5.00216   0.110   0.9126  
## ns(age, knots = 65)1  4.43773    3.49309   1.270   0.2044  
## ns(age, knots = 65)2  2.58926    1.91507   1.352   0.1768  
## anyvac1              -0.01318    1.09988  -0.012   0.9904  
## anyav1               -0.02329    1.25255  -0.019   0.9852  
## anydx1                3.08884    1.45323   2.126   0.0339 *
## countryAustralia     -1.31527    2.07403  -0.634   0.5262  
## countryBelgium       -3.19438    8.55691  -0.373   0.7090  
## countryDenmark        1.20075    3.47199   0.346   0.7296  
## countryGreece         2.48421    3.44904   0.720   0.4716  
## countryOther          1.47824    3.11986   0.474   0.6358  
## countrySingapore     -7.53125    3.21253  -2.344   0.0194 *
## countrySpain         -0.04603    3.28565  -0.014   0.9888  
## countryThailand      -3.65539    2.22341  -1.644   0.1007  
## countryUK            -1.14258    2.63348  -0.434   0.6645  
## countryUSA           -3.43398    2.57471  -1.334   0.1828  
## seasonNH.11.12        2.67543    4.60754   0.581   0.5617  
## seasonNH.12.13        3.74415    4.11023   0.911   0.3627  
## seasonNH.13.14        2.58994    4.46203   0.580   0.5618  
## seasonNH.14.15        4.87533    4.05510   1.202   0.2297  
## seasonNH.15.16        2.47911    4.76432   0.520   0.6030  
## seasonNH.16.17        2.25768    4.12439   0.547   0.5843  
## seasonSH.10          16.16262    7.12952   2.267   0.0237 *
## seasonSH.11           4.18134    5.46665   0.765   0.4446  
## seasonSH.12           3.02691    5.00638   0.605   0.5457  
## seasonSH.13           2.30984    5.27844   0.438   0.6618  
## seasonSH.14           4.84586    4.66267   1.039   0.2991  
## seasonSH.15           3.53934    4.50396   0.786   0.4323  
## seasonSH.16           4.80866    4.54368   1.058   0.2903  
## seasonSH.17           5.52250    4.68314   1.179   0.2387  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 131.5035)
## 
##     Null deviance: 90273  on 664  degrees of freedom
## Residual deviance: 83505  on 635  degrees of freedom
## AIC: 5163
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to age plus imprinting, plus fixed effects of vaccination, av use, underlying conditions, country and season
fit2 = glm(hospdays ~ ns(age, knots = 65) + p.g2.protection + anyvac + anyav + anydx + country + season, data = train.H3N2)
summary(fit2)
## 
## Call:
## glm(formula = hospdays ~ ns(age, knots = 65) + p.g2.protection + 
##     anyvac + anyav + anydx + country + season, data = train.H3N2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -14.429   -5.650   -2.712    1.280  135.283  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           3.23457    5.52487   0.585   0.5584  
## ns(age, knots = 65)1 -0.35194    5.45352  -0.065   0.9486  
## ns(age, knots = 65)2  1.07055    2.33017   0.459   0.6461  
## p.g2.protection      -2.81326    2.46026  -1.143   0.2533  
## anyvac1              -0.05271    1.10015  -0.048   0.9618  
## anyav1               -0.15717    1.25771  -0.125   0.9006  
## anydx1                3.15385    1.45399   2.169   0.0304 *
## countryAustralia     -1.29043    2.07364  -0.622   0.5340  
## countryBelgium       -2.47299    8.57807  -0.288   0.7732  
## countryDenmark        1.18684    3.47118   0.342   0.7325  
## countryGreece         2.36692    3.44973   0.686   0.4929  
## countryOther          1.24362    3.12584   0.398   0.6909  
## countrySingapore     -7.74596    3.21723  -2.408   0.0163 *
## countrySpain         -0.17161    3.28669  -0.052   0.9584  
## countryThailand      -3.76004    2.22475  -1.690   0.0915 .
## countryUK            -1.20828    2.63347  -0.459   0.6465  
## countryUSA           -3.43951    2.57409  -1.336   0.1820  
## seasonNH.11.12        2.79283    4.60756   0.606   0.5446  
## seasonNH.12.13        3.92670    4.11234   0.955   0.3400  
## seasonNH.13.14        2.83014    4.46589   0.634   0.5265  
## seasonNH.14.15        5.17402    4.06252   1.274   0.2033  
## seasonNH.15.16        2.69096    4.76677   0.565   0.5726  
## seasonNH.16.17        2.60754    4.13473   0.631   0.5285  
## seasonSH.10          16.56039    7.13628   2.321   0.0206 *
## seasonSH.11           4.66232    5.48149   0.851   0.3953  
## seasonSH.12           2.95647    5.00555   0.591   0.5550  
## seasonSH.13           2.32712    5.27718   0.441   0.6594  
## seasonSH.14           4.82949    4.66156   1.036   0.3006  
## seasonSH.15           3.57223    4.50297   0.793   0.4279  
## seasonSH.16           4.99747    4.54558   1.099   0.2720  
## seasonSH.17           5.72480    4.68535   1.222   0.2222  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 131.4398)
## 
##     Null deviance: 90273  on 664  degrees of freedom
## Residual deviance: 83333  on 634  degrees of freedom
## AIC: 5163.7
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to age only, plus fixed effects of vaccination, av use, underlying conditions NO COUNTRY AND SEASON
fit3 = glm(hospdays ~ ns(age, knots = 65) + anyvac + anyav + anydx, data = train.H3N2)
summary(fit3)
## 
## Call:
## glm(formula = hospdays ~ ns(age, knots = 65) + anyvac + anyav + 
##     anydx, data = train.H3N2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.946   -5.757   -2.958    0.677  139.007  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           3.18653    1.87100   1.703   0.0890 .
## ns(age, knots = 65)1  4.78461    3.43792   1.392   0.1645  
## ns(age, knots = 65)2  4.20223    1.83462   2.291   0.0223 *
## anyvac1              -0.04308    0.93732  -0.046   0.9634  
## anyav1               -0.95766    1.16866  -0.819   0.4128  
## anydx1                3.20425    1.38096   2.320   0.0206 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 132.976)
## 
##     Null deviance: 90273  on 664  degrees of freedom
## Residual deviance: 87631  on 659  degrees of freedom
## AIC: 5147.1
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to age plus imprinting, plus fixed effects of vaccination, av use, underlying conditions, NO COUNTRY AND SEASON
fit4 = glm(hospdays ~ ns(age, knots = 65) + p.g2.protection + anyvac + anyav + anydx, data = train.H3N2)
summary(fit4)
## 
## Call:
## glm(formula = hospdays ~ ns(age, knots = 65) + p.g2.protection + 
##     anyvac + anyav + anydx, data = train.H3N2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.889   -5.759   -3.029    0.867  138.921  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           5.48083    3.02042   1.815   0.0700 .
## ns(age, knots = 65)1  0.81658    5.35135   0.153   0.8788  
## ns(age, knots = 65)2  2.90823    2.27036   1.281   0.2007  
## p.g2.protection      -2.31650    2.39401  -0.968   0.3336  
## anyvac1              -0.04439    0.93737  -0.047   0.9622  
## anyav1               -1.03393    1.17137  -0.883   0.3777  
## anydx1                3.27513    1.38297   2.368   0.0182 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 132.9888)
## 
##     Null deviance: 90273  on 664  degrees of freedom
## Residual deviance: 87507  on 658  degrees of freedom
## AIC: 5148.2
## 
## Number of Fisher Scoring iterations: 2
## Plot the fits for each model side by side
par(mfrow = c(1, 2), las = 3, mar = c(6, 6, 6, 3), cex.axis = .7)
library(gam)
plot.gam(fit00, se = T, col = 'firebrick1', main = 'Simplified Baseline')

plot.gam(fit0, se = T, col = 'firebrick1', main = 'Baseline')

plot.gam(fit1, se = T, col = 'firebrick1', main = 'Baseline + age + imp')

plot.gam(fit2, se = T, col = 'firebrick1', main = 'Baseline + age + imp')

plot.gam(fit3, se = T, col = 'firebrick1', main = 'Simple baseline + age + imp')

plot.gam(fit4, se = T, col = 'firebrick1', main = 'Simple baseline + age + imp')

pdf('003Hospdays_H3N2.pdf')
plot.gam(fit4, se = T, col = 'firebrick1', main = 'Simple baseline + age + imp')
dev.off()
## quartz_off_screen 
##                 2
# Predict probs for each patient
pred00 = predict(fit00, newdata = test.H3N2)
pred0 = predict(fit0, newdata = test.H3N2)
pred1 = predict(fit1, newdata = test.H3N2)
pred2 = predict(fit2, newdata = test.H3N2)
pred3 = predict(fit3, newdata = test.H3N2)
pred4 = predict(fit4, newdata = test.H3N2)
  
# Estimate the simulated error rate
MSE = numeric(6); names(MSE) = c('simple.baseline', 'baseline', 'baeline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')
MSE[1] = mean((pred00 - test.H3N2$hospdays)^2)
MSE[2] = mean((pred0 - test.H3N2$hospdays)^2)
MSE[3] = mean((pred1 - test.H3N2$hospdays)^2)
MSE[4] = mean((pred2 - test.H3N2$hospdays)^2)
MSE[5] = mean((pred3 - test.H3N2$hospdays)^2)
MSE[6] = mean((pred4 - test.H3N2$hospdays)^2)
sort(MSE)
##             baeline+age        baseline+age+imp                baseline 
##                107.1434                107.5259                107.5507 
##     simple.baseline+age         simple.baseline simple.baseline+age+imp 
##                109.5733                109.9962                109.9967
AIC = c(fit00$aic, fit0$aic, fit1$aic, fit2$aic, fit3$aic, fit4$aic); names(AIC) =  c('simple.baseline', 'baseline', 'baseline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')
del.AIC = sort(AIC - min(AIC))
del.AIC
##     simple.baseline+age simple.baseline+age+imp         simple.baseline 
##                0.000000                1.054417                3.261739 
##                baseline            baseline+age        baseline+age+imp 
##               15.492766               15.924631               16.554570
load('master.AIC.table.H3N2.RData')
master.AIC.table['003Hospdays',  c('simple.baseline', 'baseline', 'baseline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')] = AIC - min(AIC)
save(master.AIC.table, file = 'master.AIC.table.H3N2.RData')
rm(master.AIC.table)